library(readr)
library(tibble)
library(tidyr)
library(dplyr)
library(tidytext)
library(ggplot2)
library(tm)
library(topicmodels)
library(stringr)
library(koRpus)
# Read the weeknotes CSV without treating the first row as a header, then
# label the ten columns with their episode identifiers in file order.
weeknotes_ryan <- setNames(
  as_tibble(read_csv("weeknotes_ryan.csv", col_names = FALSE)),
  c(
    "s02e01", "s01e09", "s01e08", "s01e07", "s01e06",
    "s01e05", "s01e04", "s01e03", "s01e02", "s01e01"
  )
)
# Reshape columns 1 to 10 into long form: the former column name goes to
# `episode` and the cell content to `text`. Then display the result.
tidy_notes <- gather(weeknotes_ryan, "episode", "text", 1:10)
tidy_notes
# Replace every full stop in `text` with a space and store the result in a new
# `text2` column; `text` itself is kept unchanged.
# A previous step built `tidier_notes` with the pattern "//." (two slashes
# followed by any character, not a full stop) and was overwritten straight
# away, so only the effective step is kept here.
tidier_notes <- tidy_notes %>%
  mutate(text2 = gsub(".", " ", text, fixed = TRUE))
# Tokenize the `text2` column into one token per row (column `word`), then
# display the result.
tidy_tokens <- unnest_tokens(tidier_notes, word, text2)
tidy_tokens
# Tag the tokens with TreeTagger through koRpus' treetag(). The input is the
# in-memory `word` vector (format = "obj"); the TreeTagger installation is
# looked up under ../TreeTagger/ using the "en" preset.
# NOTE(review): only the `word` column is passed in, so the `episode` column of
# tidy_tokens is not handed to the tagger. Confirm that `episode` is still
# available where later steps filter and count by it.
tidy_tokens_tagged <- treetag(
  tidy_tokens$word,
  treetagger = "manual",
  format = "obj",
  TT.tknz = FALSE,
  lang = "en",
  TT.options = list(path = "../TreeTagger/", preset = "en")
)
# Console output captured from the treetag() call above:
# Assuming 'UTF-8' as encoding for the input file. If the results turn out to be erroneous, check the file for invalid characters, e.g. em.dashes or fancy quotes, and/or consider setting 'encoding' manually.
# Can't find the lexicon file, hence omitted! Please ensure this path is valid:
#   ../TreeTagger//lib/english-lexicon.txt
# Show the tagger output as a tibble.
tagged_tokens <- as_tibble(tidy_tokens_tagged@TT.res)
tagged_tokens

# The tagger was given only the bare word vector, yet later steps filter and
# count by `episode`. If the tagged table lacks that column, re-attach it from
# tidy_tokens, but only when the tagged tokens line up one-to-one with the
# tokens that were sent in; otherwise warn instead of mislabelling rows.
if (!"episode" %in% names(tagged_tokens)) {
  tokens_align <- nrow(tagged_tokens) == nrow(tidy_tokens) &&
    all(tolower(tagged_tokens$token) == tidy_tokens$word, na.rm = TRUE)
  if (tokens_align) {
    tagged_tokens <- mutate(tagged_tokens, episode = tidy_tokens$episode)
  } else {
    warning(
      "Tagged tokens do not line up with tidy_tokens; ",
      "`episode` could not be attached.",
      call. = FALSE
    )
  }
}

data("stop_words")
# Extra contractions written with a curly apostrophe, removed in addition to
# the entries of `stop_words`.
other_stop_words <- c("i’d", "i’m", "i’ve")

# Drop tokens that appear in `stop_words` or in the extra list.
tidier_wo_stopwords <- tagged_tokens %>%
  anti_join(stop_words, by = c("token" = "word")) %>%
  filter(!(token %in% other_stop_words))

# Sanity check: list any remaining tokens that still contain "i’d".
tidier_wo_stopwords %>%
  filter(str_detect(token, regex("i’d", ignore_case = TRUE)))

# Rename `token` to `word` so the tidytext steps that follow can use it.
tidier_wo_stopwords <- rename(tidier_wo_stopwords, word = token)
tidier_wo_stopwords
# Word frequencies over all episodes, most frequent first.
count(tidier_wo_stopwords, word, sort = TRUE)

# Word frequencies restricted to episode s01e01.
# NOTE(review): this needs an `episode` column in tidier_wo_stopwords; verify
# that the tagged token table it is built from actually carries one.
count(filter(tidier_wo_stopwords, episode == "s01e01"), word, sort = TRUE)

# Word frequencies restricted to episode s02e01.
count(filter(tidier_wo_stopwords, episode == "s02e01"), word, sort = TRUE)
# Horizontal bar chart of the words that occur more than 30 times, with the
# bars ordered by count and no label on the word axis.
tidier_wo_stopwords %>%
  count(word, sort = TRUE) %>%
  filter(n > 30) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col() +
  labs(x = NULL) +
  coord_flip()

# Display the stop-word-free token table again.
tidier_wo_stopwords
# Count each word per episode.
kk <- tidier_wo_stopwords %>%
  count(episode, word, sort = TRUE) %>%
  ungroup()

# Add tf, idf and tf-idf columns, using `episode` as the document and `n` as
# the count.
tidier_tf_idf <- bind_tf_idf(kk, word, episode, n)

# Display the rows sorted by descending tf-idf.
arrange(tidier_tf_idf, desc(tf_idf))
# Sort by descending tf-idf and turn `word` into a factor whose levels follow
# the reverse of that order.
plot_episodes_tf_idf <- tidier_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word))))

# Per episode, keep the 7 rows with the highest tf-idf and plot them as
# horizontal bars, one facet per episode. The ranking column is named
# explicitly so the selection does not depend on `tf_idf` happening to be the
# last column of the table.
plot_episodes_tf_idf %>%
  group_by(episode) %>%
  top_n(7, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = episode)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~episode, ncol = 2, scales = "free") +
  coord_flip()
# Tokenize the original `text` column into 2-grams, one per row (column
# `bigram`).
bigrams <- unnest_tokens(tidier_notes, bigram, text, token = "ngrams", n = 2)

# Bigram frequencies, most frequent first.
count(bigrams, bigram, sort = TRUE)
# Split each bigram at the space into its two words.
bigrams_separated <- bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Drop a bigram when either word is in `stop_words` or in the extra
# `other_stop_words` list. Both positions are checked against both lists;
# previously `word2` was checked against `other_stop_words` twice and `word1`
# never was.
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word1 %in% other_stop_words) %>%
  filter(!word2 %in% other_stop_words)

# New bigram counts, most frequent first.
bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)

bigram_counts
# Tokenize the original `text` column into 3-grams, one per row (column
# `trigram`).
trigrams <- unnest_tokens(tidier_notes, trigram, text, token = "ngrams", n = 3)

# Split each trigram at the spaces into its three words.
trigrams_separated <- separate(
  trigrams,
  trigram,
  c("word1", "word2", "word3"),
  sep = " "
)

# Keep a trigram only when neither its first nor its last word is in
# `stop_words` or `other_stop_words`; the middle word is not checked.
trigrams_filtered <- filter(
  trigrams_separated,
  !word1 %in% stop_words$word,
  !word3 %in% stop_words$word,
  !word1 %in% other_stop_words,
  !word3 %in% other_stop_words
)

# Trigram counts, most frequent first.
trigram_counts <- count(trigrams_filtered, word1, word2, word3, sort = TRUE)

trigram_counts
# Cast the per-episode word counts to a document-term matrix: episodes are the
# documents, words the terms and `n` the cell values.
dtm_matrix <- cast_dtm(
  tidier_tf_idf,
  document = episode,
  term = word,
  value = n
)
dtm_matrix

# Cast the same counts to a sparse matrix (episode rows, word columns) and
# inspect its structure.
sparse_matrix <- cast_sparse(tidier_tf_idf, episode, word, n)
str(sparse_matrix)
# Fit a 5-topic LDA model on the document-term matrix with a fixed seed.
txt_lda <- LDA(dtm_matrix, k = 5, control = list(seed = 1234))

# Extract the "beta" matrix in tidy form (columns used below: topic, term,
# beta).
txt_topics <- tidy(txt_lda, matrix = "beta")

# For every topic keep the 10 terms with the largest beta, sorted by topic and
# then by descending beta.
top_terms <- txt_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# One facet per topic: horizontal bars of beta for that topic's top terms.
ggplot(
  mutate(top_terms, term = reorder(term, beta)),
  aes(term, beta, fill = factor(topic))
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  theme_bw()
LS0tCnRpdGxlOiAiV2Vla2x5IG5vdGVzIHRleHQgYW5hbHlzaXMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyLCBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQpsaWJyYXJ5KHJlYWRyKQpsaWJyYXJ5KHRpYmJsZSkKbGlicmFyeSh0aWR5cikKbGlicmFyeShkcGx5cikKbGlicmFyeSh0aWR5dGV4dCkKbGlicmFyeShnZ3Bsb3QyKQpsaWJyYXJ5KHRtKQpsaWJyYXJ5KHRvcGljbW9kZWxzKQpsaWJyYXJ5KHN0cmluZ3IpCmxpYnJhcnkoa29ScHVzKQpgYGAKCmBgYHtyLCBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQp3ZWVrbm90ZXNfcnlhbiA8LSBhc190aWJibGUocmVhZF9jc3YoJ3dlZWtub3Rlc19yeWFuLmNzdicsIGNvbF9uYW1lcyA9IEZBTFNFKSkKbmFtZXMod2Vla25vdGVzX3J5YW4pIDwtIGMoICJzMDJlMDEiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgInMwMWUwOSIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAiczAxZTA4IiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICJzMDFlMDciLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgInMwMWUwNiIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAiczAxZTA1IiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICJzMDFlMDQiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgInMwMWUwMyIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAiczAxZTAyIiwgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICJzMDFlMDEiCiAgICAgICApCmBgYAoKCgpgYGB7cn0Kd2Vla25vdGVzX3J5YW4gJT4lIAogIGdhdGhlcigiZXBpc29kZSIsICJ0ZXh0IiwgMToxMCkgLT4gdGlkeV9ub3RlcwpgYGAKCmBgYHtyfQp0aWR5X25vdGVzCmBgYAoKYGBge3J9CnRpZHlfbm90ZXMgJT4lIAogIG11dGF0ZSh0ZXh0MiA9IGdzdWIoIi8vLiIsICIgIiwgdGV4dCkpIC0+IHRpZGllcl9ub3RlcwpgYGAKCmBgYHtyfQp0aWR5X25vdGVzICU+JSAKICBtdXRhdGUodGV4dDIgPSBnc3ViKCJbLl0iLCAiICIsIHRleHQpKSAtPiB0aWRpZXJfbm90ZXMKYGBgCgpgYGB7cn0KdGlkaWVyX25vdGVzICU+JSAKICB1bm5lc3RfdG9rZW5zKHdvcmQsIHRleHQyKSAtPiB0aWR5X3Rva2VucwpgYGAKCmBgYHtyfQp0aWR5X3Rva2VucwpgYGAKCmBgYHtyfQp0cmVldGFnKHRpZHlfdG9rZW5zJHdvcmQsCiAgICAgICAgdHJlZXRhZ2dlciA9ICJtYW51YWwiLAogICAgICAgIGZvcm1hdCA9ICJvYmoiLAogICAgICAgIFRULnRrbnogPSBGQUxTRSwKICAgICAgICBsYW5nID0gImVuIiwKICAgICAgICBUVC5vcHRpb25zID0gbGlzdChwYXRoID0gIi4uL1RyZWVUYWdnZXIvIiwgcHJlc2V0ID0gImVuIikpIC0+IHRpZHlfdG9rZW5zX3RhZ2dlZApgYGAKCmBgYHtyfQphc190aWJibGUodGlkeV90b2tlbnNfdGFnZ2VkQFRULnJlcykKYGBgCgpgYGB7cn0KZGF0YSgic3RvcF93b3JkcyIpCm90aGVyX3N0b3Bfd29yZHMgPC0gYygiaeKAmWQiLCAiaeKAmW0i
LCAiaeKAmXZlIikKYGBgCgpgYGB7cn0KdGlkeV90b2tlbnNfdGFnZ2VkQFRULnJlcyAlPiUKICBhc190aWJibGUoKSAlPiUgCiAgYW50aV9qb2luKHN0b3Bfd29yZHMsIGJ5ID0gYygidG9rZW4iID0gIndvcmQiKSkgJT4lIAogIGZpbHRlcighKHRva2VuICVpbiUgb3RoZXJfc3RvcF93b3JkcykpIC0+IHRpZGllcl93b19zdG9wd29yZHMKYGBgCgoKYGBge3J9CnRpZGllcl93b19zdG9wd29yZHMgJT4lIAogIGZpbHRlcihzdHJfZGV0ZWN0KHRva2VuLCByZWdleCgiaeKAmWQiLCBpZ25vcmVfY2FzZSA9IFRSVUUpKSkKYGBgCmBgYHtyfQp0aWRpZXJfd29fc3RvcHdvcmRzICU+JSAKICByZW5hbWUod29yZCA9IHRva2VuKSAtPiB0aWRpZXJfd29fc3RvcHdvcmRzCmBgYAoKYGBge3J9CnRpZGllcl93b19zdG9wd29yZHMKYGBgCgoKCmBgYHtyfQp0aWRpZXJfd29fc3RvcHdvcmRzICU+JSAKICBjb3VudCh3b3JkLCBzb3J0PVRSVUUpCmBgYAoKYGBge3J9CnRpZGllcl93b19zdG9wd29yZHMgJT4lIAogIGZpbHRlcihlcGlzb2RlID09ICdzMDFlMDEnKSAlPiUgCiAgY291bnQod29yZCwgc29ydD1UUlVFKQpgYGAKCmBgYHtyfQp0aWRpZXJfd29fc3RvcHdvcmRzICU+JSAKICBmaWx0ZXIoZXBpc29kZSA9PSAnczAyZTAxJykgJT4lIAogIGNvdW50KHdvcmQsIHNvcnQ9VFJVRSkKYGBgCgpgYGB7cn0KdGlkaWVyX3dvX3N0b3B3b3JkcyAlPiUKICBjb3VudCh3b3JkLCBzb3J0ID0gVFJVRSkgJT4lCiAgZmlsdGVyKG4gPiAzMCkgJT4lCiAgbXV0YXRlKHdvcmQgPSByZW9yZGVyKHdvcmQsIG4pKSAlPiUKICBnZ3Bsb3QoYWVzKHdvcmQsIG4pKSArCiAgZ2VvbV9jb2woKSArCiAgeGxhYihOVUxMKSArCiAgY29vcmRfZmxpcCgpCmBgYAoKYGBge3J9CnRpZGllcl93b19zdG9wd29yZHMKYGBgCgpgYGB7cn0KdGlkaWVyX3dvX3N0b3B3b3JkcyAlPiUKICBjb3VudChlcGlzb2RlLCB3b3JkLCBzb3J0ID0gVFJVRSkgJT4lCiAgdW5ncm91cCgpIC0+IGtrCgprayAlPiUgCiAgYmluZF90Zl9pZGYod29yZCwgZXBpc29kZSwgbikgLT4gdGlkaWVyX3RmX2lkZiAKYGBgCgpgYGB7cn0KdGlkaWVyX3RmX2lkZiAlPiUgCiAgYXJyYW5nZShkZXNjKHRmX2lkZikpCmBgYAoKYGBge3J9CnRpZGllcl90Zl9pZGYgJT4lCiAgYXJyYW5nZShkZXNjKHRmX2lkZikpICU+JQogIG11dGF0ZSh3b3JkID0gZmFjdG9yKHdvcmQsIGxldmVscyA9IHJldih1bmlxdWUod29yZCkpKSkgLT4gcGxvdF9lcGlzb2Rlc190Zl9pZGYKYGBgCgpgYGB7cn0KcGxvdF9lcGlzb2Rlc190Zl9pZGYgJT4lIAogIGdyb3VwX2J5KGVwaXNvZGUpICU+JSAKICB0b3Bfbig3KSAlPiUgCiAgdW5ncm91cCAlPiUKICBnZ3Bsb3QoYWVzKHdvcmQsIHRmX2lkZiwgZmlsbCA9IGVwaXNvZGUpKSArCiAgZ2VvbV9jb2woc2hvdy5sZWdlbmQgPSBGQUxTRSkgKwogIGxhYnMoeCA9IE5VTEwsIHkgPSAidGYtaWRmIikgKwogIGZhY2V0X3dyYXAofmVwaXNvZGUsIG5jb2wgPSAyLCBzY2FsZXMgPSAiZnJlZSIpICsK
ICBjb29yZF9mbGlwKCkKYGBgCgpgYGB7cn0KdGlkaWVyX25vdGVzICU+JSAKICB1bm5lc3RfdG9rZW5zKGJpZ3JhbSwgdGV4dCwgdG9rZW4gPSAibmdyYW1zIiwgbiA9IDIpIC0+IGJpZ3JhbXMKYGBgCgpgYGB7cn0KYmlncmFtcyAlPiUKICBjb3VudChiaWdyYW0sIHNvcnQgPSBUUlVFKQpgYGAKCmBgYHtyfQpiaWdyYW1zX3NlcGFyYXRlZCA8LSBiaWdyYW1zICU+JQogIHNlcGFyYXRlKGJpZ3JhbSwgYygid29yZDEiLCAid29yZDIiKSwgc2VwID0gIiAiKQoKYmlncmFtc19maWx0ZXJlZCA8LSBiaWdyYW1zX3NlcGFyYXRlZCAlPiUKICBmaWx0ZXIoIXdvcmQxICVpbiUgc3RvcF93b3JkcyR3b3JkKSAlPiUKICBmaWx0ZXIoIXdvcmQyICVpbiUgc3RvcF93b3JkcyR3b3JkKSAlPiUgCiAgZmlsdGVyKCF3b3JkMiAlaW4lIG90aGVyX3N0b3Bfd29yZHMpICU+JSAKICBmaWx0ZXIoIXdvcmQyICVpbiUgb3RoZXJfc3RvcF93b3JkcykKICAKCiMgbmV3IGJpZ3JhbSBjb3VudHM6CmJpZ3JhbV9jb3VudHMgPC0gYmlncmFtc19maWx0ZXJlZCAlPiUgCiAgY291bnQod29yZDEsIHdvcmQyLCBzb3J0ID0gVFJVRSkKCmJpZ3JhbV9jb3VudHMKYGBgCgpgYGB7cn0KdGlkaWVyX25vdGVzICU+JSAKICB1bm5lc3RfdG9rZW5zKHRyaWdyYW0sIHRleHQsIHRva2VuID0gIm5ncmFtcyIsIG4gPSAzKSAtPiB0cmlncmFtcwoKdHJpZ3JhbXNfc2VwYXJhdGVkIDwtIHRyaWdyYW1zICU+JQogIHNlcGFyYXRlKHRyaWdyYW0sIGMoIndvcmQxIiwgIndvcmQyIiwgIndvcmQzIiksIHNlcCA9ICIgIikKCnRyaWdyYW1zX2ZpbHRlcmVkIDwtIHRyaWdyYW1zX3NlcGFyYXRlZCAlPiUKICBmaWx0ZXIoIXdvcmQxICVpbiUgc3RvcF93b3JkcyR3b3JkKSAlPiUKICBmaWx0ZXIoIXdvcmQzICVpbiUgc3RvcF93b3JkcyR3b3JkKSAlPiUgCiAgZmlsdGVyKCF3b3JkMSAlaW4lIG90aGVyX3N0b3Bfd29yZHMpICU+JSAKICBmaWx0ZXIoIXdvcmQzICVpbiUgb3RoZXJfc3RvcF93b3JkcykKICAKCiMgbmV3IGJpZ3JhbSBjb3VudHM6CnRyaWdyYW1fY291bnRzIDwtIHRyaWdyYW1zX2ZpbHRlcmVkICU+JSAKICBjb3VudCh3b3JkMSwgd29yZDIsIHdvcmQzLCBzb3J0ID0gVFJVRSkKCnRyaWdyYW1fY291bnRzCmBgYAoKYGBge3J9CnRpZGllcl90Zl9pZGYgJT4lIAogIGNhc3RfZHRtKGRvY3VtZW50ID0gZXBpc29kZSx0ZXJtID0gd29yZCwgdmFsdWUgPSBuKSAtPiBkdG1fbWF0cml4CmBgYAoKYGBge3J9CmR0bV9tYXRyaXgKYGBgCgoKYGBge3J9CnRpZGllcl90Zl9pZGYgJT4lIAogIGNhc3Rfc3BhcnNlKGVwaXNvZGUsIHdvcmQsIG4pIC0+IHNwYXJzZV9tYXRyaXgKYGBgCgpgYGB7cn0Kc3RyKHNwYXJzZV9tYXRyaXgpCmBgYAoKYGBge3J9CnR4dF9sZGEgPC0gTERBKGR0bV9tYXRyaXgsIGsgPSA1LCBjb250cm9sID0gbGlzdChzZWVkID0gMTIzNCkpCmBgYAoKCmBgYHtyfQp0eHRfdG9waWNzIDwtIHRpZHkodHh0X2xkYSwgbWF0cml4ID0gImJldGEiKQp0b3BfdGVybXMgPC0gdHh0
X3RvcGljcyAlPiUKICBncm91cF9ieSh0b3BpYykgJT4lCiAgdG9wX24oMTAsIGJldGEpICU+JQogIHVuZ3JvdXAoKSAlPiUKICBhcnJhbmdlKHRvcGljLCAtYmV0YSkKCnRvcF90ZXJtcyAlPiUKICBtdXRhdGUodGVybSA9IHJlb3JkZXIodGVybSwgYmV0YSkpICU+JQogIGdncGxvdChhZXModGVybSwgYmV0YSwgZmlsbCA9IGZhY3Rvcih0b3BpYykpKSArCiAgZ2VvbV9jb2woc2hvdy5sZWdlbmQgPSBGQUxTRSkgKwogIGZhY2V0X3dyYXAofiB0b3BpYywgc2NhbGVzID0gImZyZWUiKSArCiAgY29vcmRfZmxpcCgpICsKICB0aGVtZV9idygpCmBgYAoK